Q1 Solution

Author

Amit Chawla

if (!require("pacman")) 
  install.packages("pacman")

pacman::p_load(tidyverse,
               janitor,
               scales,
               stringr,
               ggthemes,
               ggrepel,
               patchwork,
               grid)

# set theme for ggplot2
ggplot2::theme_set(ggplot2::theme_minimal(base_size = 20))

# set width of code output
options(width = 65)

# set figure parameters for knitr
knitr::opts_chunk$set(
  fig.width = 11, # 7" width
  fig.asp = 0.618, # the golden ratio
  fig.retina = 3, # dpi multiplier for displaying HTML output on retina
  fig.align = "center", # center align figures
  dpi = 300 # higher dpi, sharper image
)
tuesdata <- tidytuesdayR::tt_load('2025-01-21')

exped_tidy <- tuesdata$exped_tidy
peaks_tidy <- tuesdata$peaks_tidy


exped_tidy_cleaned <- clean_names(exped_tidy)
peaks_tidy_cleaned <- clean_names(peaks_tidy)

pivoted_data <- exped_tidy_cleaned |>
  pivot_longer(
    cols = c(route1, route2, route3, route4, success1, success2, success3, success4),
    names_to = c(".value", "index"),
    names_pattern = "(route|success)(\\d+)"
  )

pivoted_data <- pivoted_data |>
  filter(!is.na(route))

pivoted_data <- pivoted_data |>
  mutate(route = str_replace_all(route, "S Col SE Ridge", "S Col-SE Ridge"))

summary_data <- pivoted_data |>
  group_by(peakid) |>
  summarize(
    attempts = n(),
    success_rate = (mean(success, na.rm = TRUE) * 100), # This remains NUMERIC for fill aesthetic
    .groups = 'drop' # Good practice to drop grouping after summarize
  ) |>
  arrange(desc(attempts)) |>
  slice_head(n = 4)

# Create a SEPARATE column for the label, formatted as a character string
summary_data$success_rate_label <- sprintf("%.1f%%", summary_data$success_rate)

# Join with peaks_tidy_cleaned to get peak names, keeping original peakid
summary_data <- summary_data |>
  left_join(select(peaks_tidy_cleaned, peakid, pkname), by = "peakid")

ggplot(summary_data, aes(x = reorder(pkname, -attempts), y = attempts, fill = success_rate)) +
  geom_col() +
  geom_text(aes(label = success_rate_label), # Use the new character column for labels
            vjust = 1, # Adjust vertical position if needed
            nudge_y = -3,
            color = "black") + # Choose a good color for readability
  scale_fill_viridis_c(
    option = "viridis", # Corrected 'options' to 'option'
    name = "Success Rate (%)",
    limits = c(0, 100),
    labels = function(x) paste0(x, "%")
  ) +
  coord_cartesian(ylim = c(0, 200)) +
  labs(
    x = "Peaks",
    y = "Number of Attempts",
    title = "Top 4 Peaks by Number of Attempts by all Nations",
    subtitle = "Bar fill indicates success rate (%)",
    caption = "Source: https://github.com/rfordatascience/tidytuesday"
  ) +
  theme_minimal(base_size = 14)

selected_peaks <- c("EVER", "AMAD", "LHOT", "MANA", "HIML")

pivoted_data_cleaned <- pivoted_data |>
  filter(!is.na(route) & !is.na(nation))

summary_data_with_combined_key <- pivoted_data_cleaned |>
  group_by(peakid, route, nation) |>
  summarize(
    attempts_on_route_per_nation = n(),
    success_rate_on_route = (mean(success, na.rm = TRUE) * 100),
    .groups = 'drop'
  ) |>
  mutate(peakid_route = paste(peakid, route, sep = " - ")) |>
  arrange(peakid, route, desc(attempts_on_route_per_nation))

ever_df <- summary_data_with_combined_key |>
  filter(peakid == 'EVER')

# Calculate total attempts per nation across all routes
total_attempts <- ever_df |>
  group_by(nation)|>
  summarise(total_attempts = sum(attempts_on_route_per_nation)) |>
  slice_max(order_by = total_attempts, n = 3)

# Get the top 3 nations
top_nations <- total_attempts$nation

# Filter the dataframe to include only the top 3 nations
ever_top3 <- ever_df |> 
  filter(nation %in% top_nations)

# Create the bubble chart
p1 <- ggplot(ever_top3, aes(x = route, y = success_rate_on_route)) +
  geom_point(aes(color = attempts_on_route_per_nation, size = attempts_on_route_per_nation),
             alpha = 0.7,
             position = "identity") +
  geom_text_repel(data = subset(ever_top3, attempts_on_route_per_nation >= 1),
                  aes(label = nation, color = attempts_on_route_per_nation),
                  size = 5,
                  box.padding = 0.7,
                  point.padding = 0.6,
                  min.segment.length = Inf) +
  scale_color_viridis_c(option = "turbo",
                        name = "Attempts on Route",
                        breaks = c(1, 10, 20, 30, 40),
                        limits = c(0, 40),
                        guide = "none") +
  scale_size_continuous(range = c(3, 15),
                        name = "Attempts on Route",
                        breaks = c(10, 20, 30),
                        limits = c(0, 40),
                        guide = "none")+
  annotate("text", y = 125, x = 0.7, label = "Everest", size = 5, fontface = "bold") +
  labs(x = NULL,
       y = "Success Rate (%)") +
  scale_y_continuous(breaks = seq(0, 100, by = 25)) +
  scale_x_discrete(labels = label_wrap(10)) +
  coord_cartesian(ylim = c(-5, 125)) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(hjust = 0.5, size = 9),
    legend.position = "none"
  )
amad_df <- summary_data_with_combined_key |>
  filter(peakid == 'AMAD')

# Calculate total attempts per nation across all routes
total_attempts <- amad_df |>
  group_by(nation)|>
  summarise(total_attempts = sum(attempts_on_route_per_nation)) |>
  slice_max(order_by = total_attempts, n = 3)

# Get the top 3 nations
top_nations <- total_attempts$nation

# Filter the dataframe to include only the top 3 nations
amad_top3 <- amad_df |> 
  filter(nation %in% top_nations)

print(amad_top3)
# A tibble: 5 × 6
  peakid route    nation attempts_on_route_per_nation
  <chr>  <chr>    <chr>                         <int>
1 AMAD   N Ridge  USA                               1
2 AMAD   SW Ridge USA                              23
3 AMAD   SW Ridge UK                               21
4 AMAD   SW Ridge Nepal                            11
5 AMAD   W Face   Nepal                             1
# ℹ 2 more variables: success_rate_on_route <dbl>,
#   peakid_route <chr>
# Create the bubble chart
p2 <- ggplot(amad_top3, aes(x = route, y = success_rate_on_route)) +
  geom_point(aes(color = attempts_on_route_per_nation, size = attempts_on_route_per_nation),
             alpha = 0.7,
             position = position_jitter(width = 0)) +
  geom_text_repel(data = subset(amad_top3, attempts_on_route_per_nation >= 1),
                  aes(label = nation, color = attempts_on_route_per_nation),
                  size = 5,
                  box.padding = 0.6,
                  point.padding = 0.8,
                  min.segment.length = Inf) +
  scale_color_viridis_c(option = "turbo",
                        name = "Attempts on Route",
                        breaks = c(1, 10, 20, 30, 40),
                        limits = c(0, 40),
                        guide = "none") +
  scale_size_continuous(range = c(3, 15),
                        name = "Attempts on Route",
                        breaks = c(10, 20, 30),
                        limits = c(0, 40),
                        guide = "none") +
  annotate("text", y = 120, x = 0.9, label = "Ama Dablam", size = 5, fontface = "bold") +
  labs(x = NULL,
       y = NULL) +
  scale_y_continuous(breaks = seq(0, 100, by = 25)) + # Adjusted seq start to 0 for clarity
  scale_x_discrete(labels = label_wrap(10)) +
  coord_cartesian(ylim = c(-5, 120)) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(hjust = 0.5, size = 9),
    legend.position = "none"
  )
lhot_df <- summary_data_with_combined_key |>
  filter(peakid == 'LHOT')

# Calculate total attempts per nation across all routes
total_attempts <- lhot_df |>
  group_by(nation)|>
  summarise(total_attempts = sum(attempts_on_route_per_nation)) |>
  slice_max(order_by = total_attempts, n = 3)

# Get the top 3 nations
top_nations <- total_attempts$nation

# Filter the dataframe to include only the top 3 nations
lhot_top3 <- lhot_df |> 
  filter(nation %in% top_nations)

print(lhot_top3)
# A tibble: 6 × 6
  peakid route        nation attempts_on_route_per_nation
  <chr>  <chr>        <chr>                         <int>
1 LHOT   S Col-W Face USA                              11
2 LHOT   S Col-W Face India                             7
3 LHOT   S Col-W Face Nepal                             6
4 LHOT   W Face       Nepal                            11
5 LHOT   W Face       India                            10
6 LHOT   W Face       USA                               9
# ℹ 2 more variables: success_rate_on_route <dbl>,
#   peakid_route <chr>
# Create the bubble chart
p3 <- ggplot(lhot_top3, aes(x = route, y = success_rate_on_route)) +
  geom_point(aes(color = attempts_on_route_per_nation, size = attempts_on_route_per_nation),
             alpha = 0.6,
             position = "identity") +
  geom_text_repel(data = subset(lhot_top3, attempts_on_route_per_nation >= 1),
                  aes(label = nation, color = attempts_on_route_per_nation),
                  size = 5,
                  box.padding = 0.6,
                  point.padding = 0.8,
                  min.segment.length = Inf,
                  position = "identity") +
  scale_color_viridis_c(option = "turbo",
                        name = "Attempts on Route",
                        breaks = c(1, 10, 20, 30, 40),
                        limits = c(0, 40),
                        guide = "none") +
  scale_size_continuous(range = c(3, 15),
                        name = "Attempts on Route",
                        breaks = c(10, 20, 30),
                        limits = c(0, 40),
                        guide = "none")+
  annotate("text", y = 130, x = 0.6, label = "Lhotse", size = 5, fontface = "bold") +
  labs(x = "Route",
       y = "Success Rate (%)") +
  scale_y_continuous(breaks = seq(0, 100, by = 25)) +
  scale_x_discrete(labels = label_wrap(10)) +
  coord_cartesian(ylim = c(-5, 130)) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(hjust = 0.5, size = 9),
    legend.position = "none"
  )
mana_df <- summary_data_with_combined_key |>
  filter(peakid == 'MANA')

# Calculate total attempts per nation across all routes
total_attempts <- mana_df |>
  group_by(nation)|>
  summarise(total_attempts = sum(attempts_on_route_per_nation)) |>
  slice_max(order_by = total_attempts, n = 3)

# Get the top 3 nations
top_nations <- total_attempts$nation

# Filter the dataframe to include only the top 3 nations
mana_top3 <- mana_df |> 
  filter(nation %in% top_nations)

print(mana_top3)
# A tibble: 4 × 6
  peakid route   nation attempts_on_route_per_nation
  <chr>  <chr>   <chr>                         <int>
1 MANA   NE Face Nepal                            20
2 MANA   NE Face USA                              10
3 MANA   NE Face China                             6
4 MANA   NE Face UK                                6
# ℹ 2 more variables: success_rate_on_route <dbl>,
#   peakid_route <chr>
# Create the bubble chart
p4 <- ggplot(mana_top3, aes(x = route, y = success_rate_on_route)) +
  geom_point(aes(color = attempts_on_route_per_nation, size = attempts_on_route_per_nation),
             alpha = 0.7,
             position = "identity") +
  geom_text_repel(data = subset(mana_top3, attempts_on_route_per_nation >= 1),
                  aes(label = nation, color = attempts_on_route_per_nation),
                  size = 5,
                  box.padding = 0.6,
                  point.padding = 0.8,
                  min.segment.length = Inf,
                  position = "identity") +
  scale_color_viridis_c(option = "turbo",
                        name = "\n",
                        breaks = c(1, 10, 20, 30, 40),
                        limits = c(0, 40),
                        guide = guide_colorbar(direction = "horizontal", title.position = "top")) +
  scale_size_continuous(range = c(3, 15),
                        name = "                       Attempts on route metrix (size + color)",
                        breaks = c(1, 10, 20, 30, 40),
                        limits = c(0, 40),
                        guide = guide_legend(title.position = "top"))+
  annotate("text", y = 120, x = 0.55, label = "Manaslu", size = 5, fontface = "bold") +
  labs (x = "Route",
       y = NULL) +
  scale_y_continuous(breaks = seq(0, 100, by = 25)) +
  scale_x_discrete(labels = label_wrap(10)) +
  coord_cartesian(ylim = c(-5, 120)) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(hjust = 0.5, size = 9),
    legend.position = "none"
  )
# Combine plots with patchwork
combined_plot <- (p1 + p2) / (p3 + p4) +
  plot_layout(guides = "collect") +
  plot_annotation(
    title = "National Route Preferences and Success Rates\nin High-Altitude Peak Expeditions (2020-2024)",
    subtitle = "Bubble Size and Color Show Attempts, with Success Rates\nfor Top 3 Nations Across Four Most Popular Peaks",
    caption = "Source: https://github.com/rfordatascience/tidytuesday",
    theme = theme(
      plot.title = element_text(face = "bold", size = 18, hjust = 0.2),
      plot.subtitle = element_text(size = 14, hjust = 0.2),
      plot.caption = element_text(size = 14)
    )
  ) &
  theme(
    legend.position = "bottom",
    legend.box = "horizontal",
    legend.title.align = 0.5,
    #legend.margin = margin(t = 5, b = 5),
    legend.title = element_text(size = 14)
  )

plot(combined_plot)